library(ggplot2) # graphics library
library(MASS) # contains data sets, including Boston
library(ISLR) # contains code and data from the textbook
library(knitr) # contains kable() function
library(gridExtra) #Miscellaneous Functions for "Grid" Graphics
library(dplyr) # contains useful functions for data manipulation
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:gridExtra':
##
## combine
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(jtools) # Analysis and Presentation of Social Scientific Data
library(magrittr) # A Forward-Pipe Operator for R: %>%
library(kableExtra) # For constructing complex tables
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
library(ggpubr) # 'ggplot2' Based Publication Ready Plots
## Registered S3 methods overwritten by 'broom':
## method from
## tidy.glht jtools
## tidy.summary.glht jtools
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-1
library(leaps) # needed for regsubsets
library(boot) # needed for cv.glm
library(gridExtra)
library(ggcorrplot)
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(rpart)
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:gridExtra':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(gbm)
## Loaded gbm 2.1.8
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(rpart)
library(partykit)
## Loading required package: grid
## Loading required package: libcoin
## Loading required package: mvtnorm
options(scipen = 4) # Suppresses scientific notation
In this project, we intend to address the issue of flight delays to improve customer satisfaction, especially for individuals (also potential customers) who receive travelers at the arrival airport. These uncertainties create a lot of dissatisfaction for passengers as well as for those receiving them at the arrival airport, and hence damage the overall reputation of an airline. For this analysis, we used the 2006 dataset, from which we used 46,666 rows (with arrival destination PIT International Airport only) to train our model. Later, we use 20% of the 2018-2019 dataset to test our model.
We believe that the 2006 dataset would be an appropriate fit for training our model, which would then help us understand how flight delays have evolved over the years as we move forward to analyze changes between the 2006 and 2018-19 datasets.
# Load the 2006 PIT-arrivals dataset and preview its first rows.
all_PIT_2006 <- read.csv("all_PIT_2006.csv", header = TRUE)
head(all_PIT_2006)
## Year Quarter Month AirlineID UniqueCarrier Carrier FlightDate DayofMonth
## 1 2006 1 1 20355 US US 1/11/2006 11
## 2 2006 1 1 20355 US US 1/11/2006 11
## 3 2006 1 1 20355 US US 1/11/2006 11
## 4 2006 1 1 20355 US US 1/11/2006 11
## 5 2006 1 1 20355 US US 1/11/2006 11
## 6 2006 1 1 20355 US US 1/11/2006 11
## DayOfWeek Flights FlightNum TailNum ActualElapsedTime CRSElapsedTime AirTime
## 1 3 1 1607 N813MA 96 90 77
## 2 3 1 135 N114UW 98 109 81
## 3 3 1 139 N747UW 108 111 83
## 4 3 1 559 N433US 111 105 85
## 5 3 1 190 N745UW 73 85 59
## 6 3 1 258 N420US 74 81 57
## ArrDel15 ArrDel30 ArrDelSys15 ArrDelSys30 ArrDelay ArrTime ArrTimeBlk
## 1 0 0 0 0 1 1651 1600-1659
## 2 0 0 0 0 -7 1722 1700-1759
## 3 1 0 1 0 16 2012 1900-1959
## 4 0 0 0 0 0 750 0700-0759
## 5 0 0 0 0 -13 2242 2200-2259
## 6 0 0 0 0 -4 857 0900-0959
## CRSArrTime DepDel15 DepDel30 DepDelSys15 DepDelSys30 DepDelay DepTime
## 1 1650 0 0 0 0 -5 1515
## 2 1729 0 0 0 0 4 1544
## 3 1956 1 0 1 0 19 1824
## 4 750 0 0 0 0 -6 559
## 5 2255 0 0 0 0 -1 2129
## 6 901 0 0 0 0 3 743
## DepTimeBlk CRSDepTime Origin OriginCityName OriginState OriginStateFips
## 1 1500-1559 1520 BDL Hartford CT 9
## 2 1500-1559 1540 BOS Boston MA 25
## 3 1800-1859 1805 BOS Boston MA 25
## 4 0600-0659 605 BOS Boston MA 25
## 5 2100-2159 2130 CLT Charlotte NC 37
## 6 0700-0759 740 CLT Charlotte NC 37
## OriginStateName OriginWac Dest DestCityName DestState DestStateFips
## 1 Connecticut 11 PIT Pittsburgh PA 42
## 2 Massachusetts 13 PIT Pittsburgh PA 42
## 3 Massachusetts 13 PIT Pittsburgh PA 42
## 4 Massachusetts 13 PIT Pittsburgh PA 42
## 5 North Carolina 36 PIT Pittsburgh PA 42
## 6 North Carolina 36 PIT Pittsburgh PA 42
## DestStateName DestWac Distance DistanceGroup TaxiIn TaxiOut WheelsOff
## 1 Pennsylvania 23 406 2 5 14 1529
## 2 Pennsylvania 23 496 2 3 14 1558
## 3 Pennsylvania 23 496 2 5 20 1844
## 4 Pennsylvania 23 496 2 5 21 620
## 5 Pennsylvania 23 366 2 4 10 2139
## 6 Pennsylvania 23 366 2 5 12 755
## WheelsOn Cancelled CancellationCode Diverted CarrierDelay WeatherDelay
## 1 1646 0 0 0 0
## 2 1719 0 0 0 0
## 3 2007 0 0 0 0
## 4 745 0 0 0 0
## 5 2238 0 0 0 0
## 6 852 0 0 0 0
## NASDelay SecurityDelay LateAircraftDelay
## 1 0 0 0
## 2 0 0 0
## 3 0 0 16
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
# Load the March 2018 - February 2019 PIT dataset and preview its first rows.
all_PIT_201803_201902 <- read.csv("all_PIT_201803_201902.csv", header = TRUE)
head(all_PIT_201803_201902)
## Year Quarter Month DayofMonth DayOfWeek FlightDate Reporting_Airline
## 1 2018 1 3 1 4 3/1/2018 9E
## 2 2018 1 3 1 4 3/1/2018 9E
## 3 2018 1 3 1 4 3/1/2018 9E
## 4 2018 1 3 1 4 3/1/2018 9E
## 5 2018 1 3 1 4 3/1/2018 9E
## 6 2018 1 3 1 4 3/1/2018 9E
## DOT_ID_Reporting_Airline IATA_CODE_Reporting_Airline Tail_Number
## 1 20363 9E N902XJ
## 2 20363 9E N272PQ
## 3 20363 9E N309PQ
## 4 20363 9E N293PQ
## 5 20363 9E N272PQ
## 6 20363 9E N200PQ
## Flight_Number_Reporting_Airline OriginAirportID OriginAirportSeqID
## 1 3359 14122 1412202
## 2 3427 11433 1143302
## 3 3464 14122 1412202
## 4 3490 14122 1412202
## 5 3499 14122 1412202
## 6 3501 14122 1412202
## OriginCityMarketID Origin OriginCityName OriginState OriginStateFips
## 1 30198 PIT Pittsburgh, PA PA 42
## 2 31295 DTW Detroit, MI MI 26
## 3 30198 PIT Pittsburgh, PA PA 42
## 4 30198 PIT Pittsburgh, PA PA 42
## 5 30198 PIT Pittsburgh, PA PA 42
## 6 30198 PIT Pittsburgh, PA PA 42
## OriginStateName OriginWac DestAirportID DestAirportSeqID DestCityMarketID
## 1 Pennsylvania 23 13487 1348702 31650
## 2 Michigan 43 14122 1412202 30198
## 3 Pennsylvania 23 12478 1247805 31703
## 4 Pennsylvania 23 11433 1143302 31295
## 5 Pennsylvania 23 11433 1143302 31295
## 6 Pennsylvania 23 11433 1143302 31295
## Dest DestCityName DestState DestStateFips DestStateName DestWac CRSDepTime
## 1 MSP Minneapolis, MN MN 27 Minnesota 63 1730
## 2 PIT Pittsburgh, PA PA 42 Pennsylvania 23 1530
## 3 JFK New York, NY NY 36 New York 22 600
## 4 DTW Detroit, MI MI 26 Michigan 43 1300
## 5 DTW Detroit, MI MI 26 Michigan 43 1736
## 6 DTW Detroit, MI MI 26 Michigan 43 956
## DepTime DepDelay DepDelayMinutes DepDel15 DepartureDelayGroups DepTimeBlk
## 1 1730 NA NA NA NA 1700-1759
## 2 1536 6 6 0 0 1500-1559
## 3 600 NA NA NA NA 0600-0659
## 4 1253 -7 0 0 -1 1300-1359
## 5 2023 167 167 1 11 1700-1759
## 6 953 -3 0 0 -1 0900-0959
## TaxiOut WheelsOff WheelsOn TaxiIn CRSArrTime ArrTime ArrDelay ArrDelayMinutes
## 1 17 1747 1838 3 1856 1841 -15 0
## 2 84 1700 1736 7 1645 1743 58 58
## 3 13 613 711 5 750 716 -34 0
## 4 17 1310 1358 21 1411 1419 8 8
## 5 23 2046 2129 20 1859 2149 170 170
## 6 19 1012 1101 6 1119 1107 -12 0
## ArrDel15 ArrivalDelayGroups ArrTimeBlk Cancelled CancellationCode Diverted
## 1 0 -1 1800-1859 0 <NA> 0
## 2 1 3 1600-1659 0 <NA> 0
## 3 0 -2 0700-0759 0 <NA> 0
## 4 0 0 1400-1459 0 <NA> 0
## 5 1 11 1800-1859 0 <NA> 0
## 6 0 -1 1100-1159 0 <NA> 0
## CRSElapsedTime ActualElapsedTime AirTime Flights Distance DistanceGroup
## 1 146 131 111 1 726 3
## 2 75 127 36 1 201 1
## 3 110 76 58 1 340 2
## 4 71 86 48 1 201 1
## 5 83 86 43 1 201 1
## 6 83 74 49 1 201 1
## CarrierDelay WeatherDelay NASDelay SecurityDelay LateAircraftDelay
## 1 NA NA NA NA NA
## 2 0 3 52 0 3
## 3 NA NA NA NA NA
## 4 NA NA NA NA NA
## 5 0 0 128 0 42
## 6 NA NA NA NA NA
## FirstDepTime TotalAddGTime LongestAddGTime
## 1 NA NA NA
## 2 NA NA NA
## 3 NA NA NA
## 4 NA NA NA
## 5 NA NA NA
## 6 NA NA NA
First, we begin by cleaning our dataset. For NA values, we decided to first run a count of total NA values and, after assessing their overall proportion, we dropped them from our dataset. This was to avoid any problems in our analysis and model training. We also assess the class of each variable within the all_PIT_2006_clean dataset to understand the nature/state of each variable before we run any analysis.
# Data cleaning: tally the missing cells, then keep complete cases only.
totalNA <- is.na(all_PIT_2006) %>% sum() # total NA cells in the raw data
all_PIT_2006_clean <- na.omit(all_PIT_2006) # drop every row containing an NA
# Sanity check (should be zero): sum(is.na(all_PIT_2006_clean))
head(all_PIT_2006_clean)
## Year Quarter Month AirlineID UniqueCarrier Carrier FlightDate DayofMonth
## 1 2006 1 1 20355 US US 1/11/2006 11
## 2 2006 1 1 20355 US US 1/11/2006 11
## 3 2006 1 1 20355 US US 1/11/2006 11
## 4 2006 1 1 20355 US US 1/11/2006 11
## 5 2006 1 1 20355 US US 1/11/2006 11
## 6 2006 1 1 20355 US US 1/11/2006 11
## DayOfWeek Flights FlightNum TailNum ActualElapsedTime CRSElapsedTime AirTime
## 1 3 1 1607 N813MA 96 90 77
## 2 3 1 135 N114UW 98 109 81
## 3 3 1 139 N747UW 108 111 83
## 4 3 1 559 N433US 111 105 85
## 5 3 1 190 N745UW 73 85 59
## 6 3 1 258 N420US 74 81 57
## ArrDel15 ArrDel30 ArrDelSys15 ArrDelSys30 ArrDelay ArrTime ArrTimeBlk
## 1 0 0 0 0 1 1651 1600-1659
## 2 0 0 0 0 -7 1722 1700-1759
## 3 1 0 1 0 16 2012 1900-1959
## 4 0 0 0 0 0 750 0700-0759
## 5 0 0 0 0 -13 2242 2200-2259
## 6 0 0 0 0 -4 857 0900-0959
## CRSArrTime DepDel15 DepDel30 DepDelSys15 DepDelSys30 DepDelay DepTime
## 1 1650 0 0 0 0 -5 1515
## 2 1729 0 0 0 0 4 1544
## 3 1956 1 0 1 0 19 1824
## 4 750 0 0 0 0 -6 559
## 5 2255 0 0 0 0 -1 2129
## 6 901 0 0 0 0 3 743
## DepTimeBlk CRSDepTime Origin OriginCityName OriginState OriginStateFips
## 1 1500-1559 1520 BDL Hartford CT 9
## 2 1500-1559 1540 BOS Boston MA 25
## 3 1800-1859 1805 BOS Boston MA 25
## 4 0600-0659 605 BOS Boston MA 25
## 5 2100-2159 2130 CLT Charlotte NC 37
## 6 0700-0759 740 CLT Charlotte NC 37
## OriginStateName OriginWac Dest DestCityName DestState DestStateFips
## 1 Connecticut 11 PIT Pittsburgh PA 42
## 2 Massachusetts 13 PIT Pittsburgh PA 42
## 3 Massachusetts 13 PIT Pittsburgh PA 42
## 4 Massachusetts 13 PIT Pittsburgh PA 42
## 5 North Carolina 36 PIT Pittsburgh PA 42
## 6 North Carolina 36 PIT Pittsburgh PA 42
## DestStateName DestWac Distance DistanceGroup TaxiIn TaxiOut WheelsOff
## 1 Pennsylvania 23 406 2 5 14 1529
## 2 Pennsylvania 23 496 2 3 14 1558
## 3 Pennsylvania 23 496 2 5 20 1844
## 4 Pennsylvania 23 496 2 5 21 620
## 5 Pennsylvania 23 366 2 4 10 2139
## 6 Pennsylvania 23 366 2 5 12 755
## WheelsOn Cancelled CancellationCode Diverted CarrierDelay WeatherDelay
## 1 1646 0 0 0 0
## 2 1719 0 0 0 0
## 3 2007 0 0 0 0
## 4 745 0 0 0 0
## 5 2238 0 0 0 0
## 6 852 0 0 0 0
## NASDelay SecurityDelay LateAircraftDelay
## 1 0 0 0
## 2 0 0 0
## 3 0 0 16
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
# Range checks: each calendar field should only contain valid values
# (Quarter 1-4, Month 1-12, DayofMonth 1-31, DayOfWeek 1-7); unique()
# would expose any stray codes such as 0 or 13.
quarterVals <- unique(all_PIT_2006_clean$Quarter)
monthVals <- unique(all_PIT_2006_clean$Month)
DayOfMonth <- unique(all_PIT_2006_clean$DayofMonth)
DayOfWeek <- unique(all_PIT_2006_clean$DayOfWeek)
# Record the class of every column before any analysis is run.
classes.PIT_06 <- data.frame(variabletype = sapply(all_PIT_2006_clean, class))
classes.PIT_06
## variabletype
## Year integer
## Quarter integer
## Month integer
## AirlineID integer
## UniqueCarrier character
## Carrier character
## FlightDate character
## DayofMonth integer
## DayOfWeek integer
## Flights integer
## FlightNum integer
## TailNum character
## ActualElapsedTime integer
## CRSElapsedTime integer
## AirTime integer
## ArrDel15 integer
## ArrDel30 integer
## ArrDelSys15 integer
## ArrDelSys30 integer
## ArrDelay integer
## ArrTime integer
## ArrTimeBlk character
## CRSArrTime integer
## DepDel15 integer
## DepDel30 integer
## DepDelSys15 integer
## DepDelSys30 integer
## DepDelay integer
## DepTime integer
## DepTimeBlk character
## CRSDepTime integer
## Origin character
## OriginCityName character
## OriginState character
## OriginStateFips integer
## OriginStateName character
## OriginWac integer
## Dest character
## DestCityName character
## DestState character
## DestStateFips integer
## DestStateName character
## DestWac integer
## Distance integer
## DistanceGroup integer
## TaxiIn integer
## TaxiOut integer
## WheelsOff integer
## WheelsOn integer
## Cancelled integer
## CancellationCode character
## Diverted integer
## CarrierDelay integer
## WeatherDelay integer
## NASDelay integer
## SecurityDelay integer
## LateAircraftDelay integer
1a. Carrier flight frequency: We can see from our plot that US carrier is the most frequently appearing carrier in our dataset followed by WN. This is helpful as we move forward with our analysis as we find patterns of flight delays based on carrier and other parameters. As we account for CarrierDelay, we see that US followed by MQ and then WN show the highest number of Carrier Delays.
1b. Calculate proportions via normalization: It is also important to understand that the number of flights varies for each carrier; hence, we should actually compare the proportion of total flights that are delayed instead of their exact numbers. We normalize our data and plot proportions accordingly. As we normalize and account for proportions, EV followed by UA show the highest proportion of Carrier delays in our dataset.
# Carrier flight frequency: bar chart of the number of flights per carrier.
# geom_bar() computes the counts itself; the original
# geom_histogram(stat = "count") is invalid usage and triggered the
# "Ignoring unknown parameters: binwidth, bins, pad" warning.
ggplot(data = all_PIT_2006_clean,
       aes(x = Carrier)) +
  geom_bar() # To check each carrier's flights
# Normalize data and plot proportions.
# Some carriers fly far fewer flights, so alongside the raw count of
# carrier-caused delays we also compute the proportion of each carrier's
# flights that were delayed. The original computed the summary table twice
# (the second summarize overwrote the first); one pass yields both columns.
delaysPerCarrier <- all_PIT_2006_clean %>%
  group_by(Carrier) %>%
  summarize(num.obs = n(),
            Sum.of.CarDelay = sum(CarrierDelay > 0),
            Prop.CarDelay = Sum.of.CarDelay / num.obs)
# Raw number of carrier-caused delays per carrier
ggplot(data = delaysPerCarrier,
       aes(x = Carrier,
           y = Sum.of.CarDelay)) + geom_bar(stat = "identity")
# But some Carriers might have fewer flights so we should actually compare the proportion of total flights that got delayed
ggplot(data = delaysPerCarrier,
       aes(x = Carrier,
           y = Prop.CarDelay)) +
  geom_bar(stat = "identity")
# Checking if carrier delays had anything to do with the Origin City.
# .groups = "drop_last" makes the grouping of the result explicit and
# silences the `summarise()` grouping message.
OriginCity.CarrierDelay <- all_PIT_2006_clean %>%
  group_by(OriginCityName, Carrier) %>%
  summarize(total = n(),
            delayedFlightNum = sum(CarrierDelay > 0),
            delayedFlightProp = delayedFlightNum / total,
            .groups = "drop_last")
# Collapse over carriers to get a per-city delay proportion.
# geom_col() draws bars at the supplied y values; the original
# geom_histogram(stat = "identity") misused the histogram geom and warned.
OriginCity.CarrierDelay %>%
  group_by(OriginCityName) %>%
  summarize(totFlightsInCity = sum(total),
            totDelayedFlights = sum(delayedFlightNum),
            proportion = totDelayedFlights / totFlightsInCity) %>%
  ggplot(aes(x = OriginCityName,
             y = proportion)) +
  geom_col() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5), legend.position = 'none')
# Check if carrier delays had anything to do with the Destination City.
DestCity.CarrierDelay <- all_PIT_2006_clean %>%
  group_by(DestCityName, Carrier) %>%
  summarize(total = n(),
            delayedFlightNum = sum(CarrierDelay > 0),
            delayedFlightProp = delayedFlightNum / total,
            .groups = "drop_last")
# Per-destination delay proportion. geom_col() replaces the misused
# geom_histogram(stat = "identity"), which only produced warnings.
DestCity.CarrierDelay %>%
  group_by(DestCityName) %>%
  summarize(totFlightsInCity = sum(total),
            totDelayedFlights = sum(delayedFlightNum),
            proportion = totDelayedFlights / totFlightsInCity) %>%
  ggplot(aes(x = DestCityName,
             y = proportion)) +
  geom_col() +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5), legend.position = 'none')
# Checking if carrier delays had anything to do with the Month.
OriginCity.MonthDelay <- all_PIT_2006_clean %>%
  group_by(Month, Carrier) %>%
  summarize(total = n(),
            delayedFlightNum = sum(CarrierDelay > 0),
            delayedFlightProp = delayedFlightNum / total,
            .groups = "drop_last")
# Collapse over carriers for a per-month delay proportion; geom_col()
# replaces the misused geom_histogram(stat = "identity").
OriginCity.MonthDelay %>%
  group_by(Month) %>%
  summarize(totalFlightsInMonth = sum(total),
            totalDelaysInMonth = sum(delayedFlightNum),
            propInMonth = totalDelaysInMonth / totalFlightsInMonth) %>%
  ggplot(aes(x = as.factor(Month),
             y = propInMonth)) +
  geom_col()
# Checking if weather delays had anything to do with the Month.
WeatherDelay <- all_PIT_2006_clean %>%
  group_by(Month) %>%
  summarize(total = n(),
            delayedFlightNum = sum(WeatherDelay > 0),
            delayedFlightProp = delayedFlightNum / total)
# The summary above is already one row per Month, so the original second
# group_by/summarize pass was a no-op; plot the proportion directly.
# geom_col() replaces the misused geom_histogram(stat = "identity").
ggplot(WeatherDelay,
       aes(x = as.factor(Month),
           y = delayedFlightProp)) +
  geom_col()
We use ArrDel15 as our response (dependent) variable, working with the assumption that a 15-minute threshold is a reasonable cutoff to qualify as a flight arrival delay. We use this as a factor variable, i.e. it equals 1 if the delay is 15 minutes or more, and 0 if the delay is less than 15 minutes. Month: From our boxplot, we can see that whether ArrDel15 is 0 or 1, the median remains the same. Distance: From our boxplot, we can see that whether ArrDel15 is 0 or 1, the median remains the same. TaxiIn: From our boxplot, we can see that whether ArrDel15 is 0 or 1, the median remains the same. DepDelay: From our boxplot, we can see that when ArrDel15 is 1, the median is higher, which means that when the arrival delay exceeds 15 minutes, DepDelay is higher too, which does make intuitive sense. WheelsOff: From our boxplot, we can see that when ArrDel15 is 1, the median is higher, meaning the wheels lift off at a later time, which makes sense given that the later the wheels leave the origin airport, the greater the arrival delay. TaxiOut: From our boxplot, we can see that when ArrDel15 is 1, the median is higher, which makes sense given that the longer an airplane takes to taxi out, the greater the arrival delay. Quarter: From our boxplot, we can see that whether ArrDel15 is 0 or 1, the median remains the same.
# Potential variable analysis
# Potential variable analysis: boxplot of each candidate predictor split
# by the ArrDel15 indicator, via a small helper taking the column name.
del15_boxplot <- function(var) {
  ggplot(data = all_PIT_2006_clean,
         aes(x = as.factor(ArrDel15), y = .data[[var]])) +
    geom_boxplot() +
    labs(x = "ArrDel15", y = var)
}
del15_boxplot("Month")
del15_boxplot("Distance")
del15_boxplot("TaxiIn")
del15_boxplot("DepDelay")
del15_boxplot("WheelsOff")
del15_boxplot("TaxiOut")
del15_boxplot("Quarter")
We plot WheelsOn against ArrDelay and, as we saw earlier with ArrDel15, the later the wheels are “on”, the greater the arrival delay.
head(all_PIT_2006_clean)
## Year Quarter Month AirlineID UniqueCarrier Carrier FlightDate DayofMonth
## 1 2006 1 1 20355 US US 1/11/2006 11
## 2 2006 1 1 20355 US US 1/11/2006 11
## 3 2006 1 1 20355 US US 1/11/2006 11
## 4 2006 1 1 20355 US US 1/11/2006 11
## 5 2006 1 1 20355 US US 1/11/2006 11
## 6 2006 1 1 20355 US US 1/11/2006 11
## DayOfWeek Flights FlightNum TailNum ActualElapsedTime CRSElapsedTime AirTime
## 1 3 1 1607 N813MA 96 90 77
## 2 3 1 135 N114UW 98 109 81
## 3 3 1 139 N747UW 108 111 83
## 4 3 1 559 N433US 111 105 85
## 5 3 1 190 N745UW 73 85 59
## 6 3 1 258 N420US 74 81 57
## ArrDel15 ArrDel30 ArrDelSys15 ArrDelSys30 ArrDelay ArrTime ArrTimeBlk
## 1 0 0 0 0 1 1651 1600-1659
## 2 0 0 0 0 -7 1722 1700-1759
## 3 1 0 1 0 16 2012 1900-1959
## 4 0 0 0 0 0 750 0700-0759
## 5 0 0 0 0 -13 2242 2200-2259
## 6 0 0 0 0 -4 857 0900-0959
## CRSArrTime DepDel15 DepDel30 DepDelSys15 DepDelSys30 DepDelay DepTime
## 1 1650 0 0 0 0 -5 1515
## 2 1729 0 0 0 0 4 1544
## 3 1956 1 0 1 0 19 1824
## 4 750 0 0 0 0 -6 559
## 5 2255 0 0 0 0 -1 2129
## 6 901 0 0 0 0 3 743
## DepTimeBlk CRSDepTime Origin OriginCityName OriginState OriginStateFips
## 1 1500-1559 1520 BDL Hartford CT 9
## 2 1500-1559 1540 BOS Boston MA 25
## 3 1800-1859 1805 BOS Boston MA 25
## 4 0600-0659 605 BOS Boston MA 25
## 5 2100-2159 2130 CLT Charlotte NC 37
## 6 0700-0759 740 CLT Charlotte NC 37
## OriginStateName OriginWac Dest DestCityName DestState DestStateFips
## 1 Connecticut 11 PIT Pittsburgh PA 42
## 2 Massachusetts 13 PIT Pittsburgh PA 42
## 3 Massachusetts 13 PIT Pittsburgh PA 42
## 4 Massachusetts 13 PIT Pittsburgh PA 42
## 5 North Carolina 36 PIT Pittsburgh PA 42
## 6 North Carolina 36 PIT Pittsburgh PA 42
## DestStateName DestWac Distance DistanceGroup TaxiIn TaxiOut WheelsOff
## 1 Pennsylvania 23 406 2 5 14 1529
## 2 Pennsylvania 23 496 2 3 14 1558
## 3 Pennsylvania 23 496 2 5 20 1844
## 4 Pennsylvania 23 496 2 5 21 620
## 5 Pennsylvania 23 366 2 4 10 2139
## 6 Pennsylvania 23 366 2 5 12 755
## WheelsOn Cancelled CancellationCode Diverted CarrierDelay WeatherDelay
## 1 1646 0 0 0 0
## 2 1719 0 0 0 0
## 3 2007 0 0 0 0
## 4 745 0 0 0 0
## 5 2238 0 0 0 0
## 6 852 0 0 0 0
## NASDelay SecurityDelay LateAircraftDelay
## 1 0 0 0
## 2 0 0 0
## 3 0 0 16
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
# Per-carrier counts of each delay type (any positive delay counts once).
delay.carrier <- all_PIT_2006_clean %>%
  group_by(Carrier) %>%
  summarize(count = n(),
            totalArrDelay = sum(ArrDel15 > 0),
            totalDepDelay = sum(DepDelay > 0),
            numCarrierDelays = sum(CarrierDelay > 0),
            numWeatherDelays = sum(WeatherDelay > 0),
            numNASDelays = sum(NASDelay > 0),
            numSecurityDelays = sum(SecurityDelay > 0),
            numLateAircraftDelays = sum(LateAircraftDelay > 0))
# Proportion of carrier-caused delays out of all late arrivals, by carrier.
# geom_col() replaces the misused geom_histogram(stat = "identity"),
# which only triggered "Ignoring unknown parameters" warnings.
ggplot(data = delay.carrier,
       aes(x = as.factor(Carrier), y = numCarrierDelays/totalArrDelay)) +
  geom_col()
# WheelsOn vs ArrDelay scatter; stat = "identity" is geom_point's default
# and is therefore omitted.
ggplot(data = all_PIT_2006_clean,
       aes(x = WheelsOn, y = ArrDelay)) +
  geom_point()
# Visual screening of candidate predictors: plot each against the
# response and look for trends.
all_PIT_2006_clean %>%
  ggplot(aes(x = as.factor(Month), y = ArrDelay)) +
  geom_boxplot()
# Month does not show a clearly significant difference across the year.
all_PIT_2006_clean %>%
  ggplot(aes(x = as.factor(Month), y = WeatherDelay)) +
  geom_boxplot()
# Month and WeatherDelay also do not appear related in a statistically
# significant way.
all_PIT_2006_clean %>%
  ggplot(aes(x = ArrDelay, y = CarrierDelay)) +
  geom_point()
# ArrDelay and CarrierDelay clearly appear correlated: a visible upward trend.
# Extract the confusion-matrix cells from a 2x2 table whose rows are the
# predicted class ("0"/"1") and whose columns are the actual class.
#
# BUG FIX: the original had four consecutive return() calls, so only the
# first ever executed and the function returned TP alone. It now returns a
# named vector with all four cells, as clearly intended.
#
# @param table A 2x2 table/matrix with dimnames c("0", "1") on both axes.
# @return Named numeric vector with elements TP, FP, TN, FN.
getMetrics <- function(table){
  c(TP = table["1", "1"],
    FP = table["1", "0"],
    TN = table["0", "0"],
    FN = table["0", "1"])
}
First, we read in both the datasets available to us. One that contains data about flights from 2006 and one that contains information about flights from both 2018 and 2019. Because of the sheer magnitude of the data, we decided to look at flights that arrived at Pittsburgh airport, and so we made a separate dataset that contained those rows for which the destination airport was PIT (Pittsburgh). We also decided to divide the training and testing dataset in a 80:20 ratio and used 20% of the total data (arrivals at pittsburgh airport) for testing and 80% for training.
Because the rows where our outcome or response variable (ArrDel15) was equal to 1 were quite few compared to those where it was equal to 0, we decided to upsample our data by adding 15,000 such rows (sampled with replacement) to our original dataset so that the imbalance between these two classes would decrease.
# Read the 2018--2019 arrivals; the 2006 data was already loaded/cleaned above.
#all_2006 <- read.csv(file = "all_PIT_2006.csv", header = TRUE)
all_2018_2019 <- read.csv(file = "all_PIT_201803_201902.csv", header = TRUE)
all_2006 <- all_PIT_2006_clean
# Keep only flights whose destination is Pittsburgh (PIT).
pit_idx <- which(all_2006$Dest == "PIT")
all_PIT_2006 <- all_2006[pit_idx,]
# Upsample the minority class (ArrDel15 == "1") by resampling 15,000 delayed
# flights with replacement and appending them.
# NOTE(review): upsampling BEFORE the train/test split below means duplicated
# delayed flights can land in both sets, which inflates test metrics --
# consider splitting first and upsampling only the training set.
flights.more.idx <- sample(which(all_PIT_2006$ArrDel15 == "1"), 15000, replace = TRUE)
flights.upsample.2006 <- rbind(all_PIT_2006,
                               all_PIT_2006[flights.more.idx, ])
all_PIT_2006 <- flights.upsample.2006
# 2018 and 2019 data is only for testing. Not training.
pit_idx_2018.2019 <- which(all_2018_2019$Dest == "PIT")
# BUG FIX: the indices come from all_2018_2019, so they must subset
# all_2018_2019 (the original code subset all_2006 with these indices,
# producing rows from the wrong year).
all_PIT_2018_2019 <- all_2018_2019[pit_idx_2018.2019,]
# head(all_PIT_2006)
# Drop rows with missing values in either dataset.
all_PIT_2006_clean <- na.omit(all_PIT_2006)
all_PIT_2018_2019_clean <- na.omit(all_PIT_2018_2019)
# Row counts after dropping missing values: upsampled 2006 data and the
# 2018--2019 holdout data respectively.
nrow(all_PIT_2006_clean)
## [1] 61666
nrow(all_PIT_2018_2019_clean)
## [1] 46580
# 80/20 train/test split of the cleaned 2006 arrivals.
n.2006 <- nrow(all_PIT_2006_clean)
test.indexes.2006 <- sample(seq_len(n.2006), round(0.2 * n.2006))
train.indexes.2006 <- setdiff(seq_len(n.2006), test.indexes.2006)
training.data.pit <- all_PIT_2006_clean[train.indexes.2006, ]
testing.data.pit <- all_PIT_2006_clean[test.indexes.2006, ]
# Same 80/20 split for the cleaned 2018--2019 arrivals.
n.1819 <- nrow(all_PIT_2018_2019_clean)
test.indexes.2018.2019 <- sample(seq_len(n.1819), round(0.2 * n.1819))
train.indexes.2018.2019 <- setdiff(seq_len(n.1819), test.indexes.2018.2019)
training.data.pit.2018.2019 <- all_PIT_2018_2019_clean[train.indexes.2018.2019, ]
testing.data.pit.2018.2019 <- all_PIT_2018_2019_clean[test.indexes.2018.2019, ]
We decided to build a model that would help people who come to pick up their relatives and friends from airports because we believe they suffer more than the travelers who are actually in a plane and can pass time using in-flight entertainment. But the people who come to pick them need to plan accordingly and arrive way before time. If these people could somehow know if the flight was being delayed, they would leave for the airport accordingly and do other important stuff. Since we’re talking about arrivals, we decided to use ArrDel15 (which is a categorical variable that has the value 1 when the flight is more than 15 mins late and 0 otherwise). Even though ArrDelay was another variable that could be used to predict delays, we did not move forward with it because we believed that smaller delays might not affect people that much and only when the delay was 15 mins or more did it start to become tiresome.
We also decided to discard some of the variables that we thought would not make sense to use as predictors. These included identifier-like variables such as FlightNum, TailNum and AirlineID, the Year variable (which had the same value for all rows in the 2006 dataset), and the raw date fields.
We are going to make a vector of all the variables that we think are important in predicting the response variable and then use variable selection techniques taught in class to find the subset of variables that would actually be helpful for our model.
impVarVector <- c("Quarter", "Month", "Carrier",
"ActualElapsedTime", "AirTime", "Distance",
"ArrDel15", "DayOfWeek", "DayofMonth", "DepDelay", "WheelsOff")
impVarVecNumeric <- c("Quarter", "Month",
"ActualElapsedTime", "AirTime", "Distance",
"ArrDel15", "DayOfWeek", "DayofMonth", "DepDelay", "WheelsOff")
training.data.pit.subset <- training.data.pit[,impVarVecNumeric]
testing.data.pit.subset <- testing.data.pit[,impVarVecNumeric]
# Quick sanity check of the numeric training subset.
head(training.data.pit.subset)
## Quarter Month ActualElapsedTime AirTime Distance ArrDel15 DayOfWeek
## 2 1 1 98 81 496 0 3
## 3 1 1 108 83 496 1 3
## 4 1 1 111 85 496 0 3
## 6 1 1 74 57 366 0 3
## 7 1 1 77 60 366 0 3
## 8 1 1 70 59 366 0 3
## DayofMonth DepDelay WheelsOff
## 2 11 4 1558
## 3 11 19 1844
## 4 11 -6 620
## 6 11 3 755
## 7 11 1 1750
## 8 11 3 1555
# Earlier approach that selected columns by raw position; superseded by the
# named impVarVecNumeric vector above.
# ArrDelayData = training.data.pit[,c(2, 3, 6, 10, 13, 15, 20, 44, 46,
# 47, 48, 49)]
Now that we have made a new dataset with the relevant variables, we check their pairwise correlations so we can deal with highly correlated variables accordingly. We use a threshold of 0.8: if the absolute correlation between two predictors exceeds 0.8, we consider them highly correlated and drop one of the pair, since such collinearity inflates the standard errors of the coefficient estimates and can make individual coefficients appear statistically insignificant.
# Pairwise correlations of the numeric candidate predictors.
corr <- cor(training.data.pit.subset)
ggcorrplot(corr, type = "lower", lab = TRUE)
# Flatten the correlation matrix and keep pairs with |r| > 0.8 (excluding the
# diagonal).  Each pair appears twice (A,B) and (B,A) in the flattened table.
high.corr <- as.data.frame(as.table(corr))
high.corr <- high.corr[(high.corr$Freq > 0.8 | high.corr$Freq < -0.8) &
high.corr$Var1 != high.corr$Var2, ]
high.corr <- na.omit(high.corr)
high.corr
## Var1 Var2 Freq
## 2 Month Quarter 0.9704198
## 11 Quarter Month 0.9704198
## 24 AirTime ActualElapsedTime 0.9517271
## 25 Distance ActualElapsedTime 0.9315724
## 33 ActualElapsedTime AirTime 0.9517271
## 35 Distance AirTime 0.9851318
## 43 ActualElapsedTime Distance 0.9315724
## 44 AirTime Distance 0.9851318
We can see from the table and correlation matrix above that several of the variables in the subset we chose are highly correlated with each other (some above 0.95!). We will now remove one variable from each highly correlated pair, and also add back the Carrier variable that we removed earlier so that the correlation could be computed (cor() works only on numeric variables).
# Drop one variable from each highly correlated pair.  Positions refer to
# impVarVector: 1 = Quarter, 4 = ActualElapsedTime, 5 = AirTime.
varsToRemove <- c(1, 4, 5) # Quarter, ActualElapsedTime and AirTime
# NOTE: despite the "Vector" names, these are data frames (column subsets of
# the train/test data), not character vectors.
finalVars <- training.data.pit[impVarVector]
finalVarsTest <- testing.data.pit[impVarVector]
impVarVectorFinal <- finalVars[-varsToRemove]
impVarVectorTest <- finalVarsTest[-varsToRemove]
colnames(impVarVectorFinal)
## [1] "Month" "Carrier" "Distance" "ArrDel15" "DayOfWeek"
## [6] "DayofMonth" "DepDelay" "WheelsOff"
# head(impVarVectorFinal)
The variables mentioned above are the final list of variables that we have chosen after common sense and correlation methods. We will now apply methods learnt in class to further trim these down if that is possible.
# Fit ArrDel15 on all remaining predictors.  glm() with the default gaussian
# family is an ordinary least-squares fit here (a linear probability model for
# the 0/1 response), which is why the summary reports t values.
# NOTE(review): family = binomial would be the standard choice for a binary
# response, but it would change the coefficient table below -- confirm intent.
glm_lexp= glm(ArrDel15 ~ .,data = impVarVectorFinal)
kable(coef(summary(glm_lexp)), digits = c(1,0,1,3))
| Estimate | Std. Error | t value | Pr(>|t|) | |
|---|---|---|---|---|
| (Intercept) | 0.1 | 0 | 8.2 | 0.000 |
| Month | 0.0 | 0 | 10.9 | 0.000 |
| CarrierCO | 0.0 | 0 | -2.1 | 0.032 |
| CarrierDL | 0.0 | 0 | 0.1 | 0.903 |
| CarrierEV | 0.1 | 0 | 5.9 | 0.000 |
| CarrierFL | 0.0 | 0 | -2.4 | 0.017 |
| CarrierMQ | 0.1 | 0 | 5.2 | 0.000 |
| CarrierNW | 0.0 | 0 | 1.2 | 0.236 |
| CarrierOH | 0.0 | 0 | -1.4 | 0.148 |
| CarrierOO | 0.0 | 0 | -2.6 | 0.009 |
| CarrierRU | 0.1 | 0 | 7.8 | 0.000 |
| CarrierUA | 0.0 | 0 | -1.8 | 0.080 |
| CarrierUS | 0.0 | 0 | -2.7 | 0.008 |
| CarrierWN | 0.0 | 0 | -3.8 | 0.000 |
| CarrierXE | 0.0 | 0 | 2.6 | 0.008 |
| CarrierYV | -0.1 | 0 | -7.4 | 0.000 |
| Distance | 0.0 | 0 | -8.5 | 0.000 |
| DayOfWeek | 0.0 | 0 | -2.6 | 0.009 |
| DayofMonth | 0.0 | 0 | 1.3 | 0.210 |
| DepDelay | 0.0 | 0 | 142.4 | 0.000 |
| WheelsOff | 0.0 | 0 | 30.1 | 0.000 |
# Keep the names of coefficients significant at the 5% level.
# NOTE: `coeffs` is computed but the next line recomputes the same matrix.
coeffs <- coef(summary(glm_lexp))
statistically.significant <- coef(summary(glm_lexp))[,"Pr(>|t|)"] < 0.05
stat.sig.vars <- names(which(statistically.significant == TRUE))
stat.sig.vars
## [1] "(Intercept)" "Month" "CarrierCO" "CarrierEV" "CarrierFL"
## [6] "CarrierMQ" "CarrierOO" "CarrierRU" "CarrierUS" "CarrierWN"
## [11] "CarrierXE" "CarrierYV" "Distance" "DayOfWeek" "DepDelay"
## [16] "WheelsOff"
We see that three of the Carriers (namely DL, NW and UA) don't have statistically significant coefficients, so we decided to apply feature engineering and create indicator columns for the remaining carriers. Instead of giving each Carrier a numeric value, we made a new column for each such Carrier, assigning 1 if the row belonged to that Carrier and 0 otherwise.
# Feature engineering: one 0/1 indicator column per carrier of interest, added
# to both the training and the test frames.  The loop replaces 22 nearly
# identical ifelse() lines; columns are created in exactly the same order as
# before, which matters because later code selects columns by position.
# NOTE(review): NW and OH are included here even though their coefficients
# were not flagged as significant above -- kept to preserve the original
# behaviour; confirm whether they should be dropped.
newImpVarVectorFinal <- impVarVectorFinal
newImpVarVectorFinalTest <- impVarVectorTest
carrier.codes <- c("CO", "EV", "FL", "MQ", "NW", "OH",
                   "OO", "RU", "US", "WN", "XE")
for (cc in carrier.codes) {
  col.name <- paste0("Carrier", cc)
  newImpVarVectorFinal[[col.name]] <- ifelse(newImpVarVectorFinal$Carrier == cc, 1, 0)
}
for (cc in carrier.codes) {
  col.name <- paste0("Carrier", cc)
  newImpVarVectorFinalTest[[col.name]] <- ifelse(newImpVarVectorFinalTest$Carrier == cc, 1, 0)
}
We use all of these features and feed them into the forward stepwise model as well as the backward stepwise model to see whether they further trim down the variables and whether they return the same results. As we can see, they both return the same variables except for two Carriers that differ. The other variables, like Month, WheelsOff, Distance and DepDelay, are selected by both. We did not do best-subsets selection because of the magnitude of our data: an exhaustive search would have taken a very long time, so we moved forward with forward and backward stepwise selection and saw that the results were very similar.
# Backward stepwise selection over all predictors (including the redundant
# carrier dummies, which is why leaps reports 11 linear dependencies below --
# the Carrier factor and its hand-made indicator columns duplicate each other).
flights.subset.bw <- regsubsets(ArrDel15 ~ .,
data = newImpVarVectorFinal,
nbest = 1, # 1 best model for each number of predictors
nvmax = NULL, # NULL for no limit on number of variables
method = "backward", really.big = TRUE)
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : 11 linear dependencies found
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : nvmax reduced to 20
## Warning in rval$lopt[] <- rval$vorder[rval$lopt]: number of items to replace is
## not a multiple of replacement length
summary(flights.subset.bw)
## Subset selection object
## Call: regsubsets.formula(ArrDel15 ~ ., data = newImpVarVectorFinal,
## nbest = 1, nvmax = NULL, method = "backward", really.big = TRUE)
## 31 Variables (and intercept)
## Forced in Forced out
## Month FALSE FALSE
## CarrierCO FALSE FALSE
## CarrierDL FALSE FALSE
## CarrierEV FALSE FALSE
## CarrierFL FALSE FALSE
## CarrierMQ FALSE FALSE
## CarrierNW FALSE FALSE
## CarrierOH FALSE FALSE
## CarrierOO FALSE FALSE
## CarrierRU FALSE FALSE
## CarrierUA FALSE FALSE
## CarrierUS FALSE FALSE
## CarrierWN FALSE FALSE
## CarrierXE FALSE FALSE
## CarrierYV FALSE FALSE
## Distance FALSE FALSE
## DayOfWeek FALSE FALSE
## DayofMonth FALSE FALSE
## DepDelay FALSE FALSE
## WheelsOff FALSE FALSE
## CarrierCO FALSE FALSE
## CarrierEV FALSE FALSE
## CarrierFL FALSE FALSE
## CarrierMQ FALSE FALSE
## CarrierNW FALSE FALSE
## CarrierOH FALSE FALSE
## CarrierOO FALSE FALSE
## CarrierRU FALSE FALSE
## CarrierUS FALSE FALSE
## CarrierWN FALSE FALSE
## CarrierXE FALSE FALSE
## 1 subsets of each size up to 20
## Selection Algorithm: backward
## Month CarrierCO CarrierDL CarrierEV CarrierFL CarrierMQ CarrierNW
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " "
## 3 ( 1 ) " " " " " " " " " " "*" " "
## 4 ( 1 ) " " " " " " " " " " "*" " "
## 5 ( 1 ) "*" " " " " " " " " "*" " "
## 6 ( 1 ) "*" " " " " " " " " "*" " "
## 7 ( 1 ) "*" " " " " " " " " "*" " "
## 8 ( 1 ) "*" " " " " "*" " " "*" " "
## 9 ( 1 ) "*" " " " " "*" " " "*" " "
## 10 ( 1 ) "*" " " " " "*" " " "*" " "
## 11 ( 1 ) "*" " " " " "*" " " "*" " "
## 12 ( 1 ) "*" " " " " "*" " " "*" " "
## 13 ( 1 ) "*" " " " " "*" "*" "*" " "
## 14 ( 1 ) "*" " " " " "*" "*" "*" " "
## 15 ( 1 ) "*" "*" " " "*" "*" "*" " "
## 16 ( 1 ) "*" "*" " " "*" "*" "*" " "
## 17 ( 1 ) "*" "*" " " "*" "*" "*" " "
## 18 ( 1 ) "*" "*" " " "*" "*" "*" "*"
## 19 ( 1 ) "*" "*" " " "*" "*" "*" "*"
## 20 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## CarrierOH CarrierOO CarrierRU CarrierUA CarrierUS CarrierWN CarrierXE
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " "
## 3 ( 1 ) " " " " " " " " " " " " " "
## 4 ( 1 ) " " " " "*" " " " " " " " "
## 5 ( 1 ) " " " " "*" " " " " " " " "
## 6 ( 1 ) " " " " "*" " " " " " " " "
## 7 ( 1 ) " " " " "*" " " " " " " " "
## 8 ( 1 ) " " " " "*" " " " " " " " "
## 9 ( 1 ) " " " " "*" " " " " " " "*"
## 10 ( 1 ) " " " " "*" " " " " "*" "*"
## 11 ( 1 ) " " " " "*" " " "*" "*" "*"
## 12 ( 1 ) " " "*" "*" " " "*" "*" "*"
## 13 ( 1 ) " " "*" "*" " " "*" "*" "*"
## 14 ( 1 ) " " "*" "*" "*" "*" "*" "*"
## 15 ( 1 ) " " "*" "*" "*" "*" "*" "*"
## 16 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## 17 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## 18 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## 19 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## 20 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
## CarrierYV Distance DayOfWeek DayofMonth DepDelay WheelsOff CarrierCO
## 1 ( 1 ) " " " " " " " " "*" " " " "
## 2 ( 1 ) " " " " " " " " "*" "*" " "
## 3 ( 1 ) " " " " " " " " "*" "*" " "
## 4 ( 1 ) " " " " " " " " "*" "*" " "
## 5 ( 1 ) " " " " " " " " "*" "*" " "
## 6 ( 1 ) " " "*" " " " " "*" "*" " "
## 7 ( 1 ) "*" "*" " " " " "*" "*" " "
## 8 ( 1 ) "*" "*" " " " " "*" "*" " "
## 9 ( 1 ) "*" "*" " " " " "*" "*" " "
## 10 ( 1 ) "*" "*" " " " " "*" "*" " "
## 11 ( 1 ) "*" "*" " " " " "*" "*" " "
## 12 ( 1 ) "*" "*" " " " " "*" "*" " "
## 13 ( 1 ) "*" "*" " " " " "*" "*" " "
## 14 ( 1 ) "*" "*" " " " " "*" "*" " "
## 15 ( 1 ) "*" "*" " " " " "*" "*" " "
## 16 ( 1 ) "*" "*" " " " " "*" "*" " "
## 17 ( 1 ) "*" "*" "*" " " "*" "*" " "
## 18 ( 1 ) "*" "*" "*" " " "*" "*" " "
## 19 ( 1 ) "*" "*" "*" "*" "*" "*" " "
## 20 ( 1 ) "*" "*" "*" "*" "*" "*" " "
## CarrierEV CarrierFL CarrierMQ CarrierNW CarrierOH CarrierOO CarrierRU
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " "
## 3 ( 1 ) " " " " " " " " " " " " " "
## 4 ( 1 ) " " " " " " " " " " " " " "
## 5 ( 1 ) " " " " " " " " " " " " " "
## 6 ( 1 ) " " " " " " " " " " " " " "
## 7 ( 1 ) " " " " " " " " " " " " " "
## 8 ( 1 ) " " " " " " " " " " " " " "
## 9 ( 1 ) " " " " " " " " " " " " " "
## 10 ( 1 ) " " " " " " " " " " " " " "
## 11 ( 1 ) " " " " " " " " " " " " " "
## 12 ( 1 ) " " " " " " " " " " " " " "
## 13 ( 1 ) " " " " " " " " " " " " " "
## 14 ( 1 ) " " " " " " " " " " " " " "
## 15 ( 1 ) " " " " " " " " " " " " " "
## 16 ( 1 ) " " " " " " " " " " " " " "
## 17 ( 1 ) " " " " " " " " " " " " " "
## 18 ( 1 ) " " " " " " " " " " " " " "
## 19 ( 1 ) " " " " " " " " " " " " " "
## 20 ( 1 ) " " " " " " " " " " " " " "
## CarrierUS CarrierWN CarrierXE
## 1 ( 1 ) " " " " " "
## 2 ( 1 ) " " " " " "
## 3 ( 1 ) " " " " " "
## 4 ( 1 ) " " " " " "
## 5 ( 1 ) " " " " " "
## 6 ( 1 ) " " " " " "
## 7 ( 1 ) " " " " " "
## 8 ( 1 ) " " " " " "
## 9 ( 1 ) " " " " " "
## 10 ( 1 ) " " " " " "
## 11 ( 1 ) " " " " " "
## 12 ( 1 ) " " " " " "
## 13 ( 1 ) " " " " " "
## 14 ( 1 ) " " " " " "
## 15 ( 1 ) " " " " " "
## 16 ( 1 ) " " " " " "
## 17 ( 1 ) " " " " " "
## 18 ( 1 ) " " " " " "
## 19 ( 1 ) " " " " " "
## 20 ( 1 ) " " " " " "
# Forward stepwise selection with the same settings as the backward run, then
# compare the BIC-minimizing variable sets from both directions.
flights.subset <- regsubsets(ArrDel15 ~ .,
data = newImpVarVectorFinal,
nbest = 1, # 1 best model for each number of predictors
nvmax = NULL, # NULL for no limit on number of variables
method = "forward", really.big = TRUE)
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : 11 linear dependencies found
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : nvmax reduced to 20
## Warning in rval$lopt[] <- rval$vorder[rval$lopt]: number of items to replace is
## not a multiple of replacement length
print("Coefficients selected by backward stepwise model: ")
## [1] "Coefficients selected by backward stepwise model: "
# Variables in the backward-stepwise model with the lowest BIC.
names(coef(flights.subset.bw, which.min(summary(flights.subset.bw)$bic)))
## [1] "(Intercept)" "Month" "CarrierEV" "CarrierMQ" "CarrierRU"
## [6] "CarrierUS" "CarrierWN" "CarrierXE" "CarrierYV" "Distance"
## [11] "DepDelay" "WheelsOff"
print("Coefficients selected by forward stepwise model: ")
## [1] "Coefficients selected by forward stepwise model: "
# Variables in the forward-stepwise model with the lowest BIC.
names(coef(flights.subset, which.min(summary(flights.subset)$bic)))
## [1] "(Intercept)" "Month" "CarrierDL" "CarrierMQ" "CarrierNW"
## [6] "CarrierRU" "CarrierWN" "CarrierYV" "Distance" "DepDelay"
## [11] "WheelsOff" "CarrierEV" "CarrierXE"
We plotted RSS (residual sum of squares), R-squared, BIC and AIC (approximated here by Mallows' Cp, which for a linear model ranks models equivalently to AIC) against the number of variables to see how each metric changed, and we were not surprised to see that RSS decreased as more variables were included. Similarly, R-squared continued to increase with more variables because more of the variation in the response (ArrDel15) was being explained by the model. The AIC curve slightly surprised us because its lowest value occurred at 19 variables, but this is probably because AIC does not penalize additional variables very strongly. The BIC curve selects a total of 10 variables, which is good because BIC penalizes additional variables more heavily and thus yields a simpler model that is less prone to overfitting. This is why we decided to move forward with minimizing the Bayesian Information Criterion (BIC) for our variable selection.
# Plot RSS, R-squared, BIC and Cp ("AIC") against model size, marking the
# optimum of each metric in red.
# NOTE(review): data.frame(flights.summary$rss) names its single column
# "flights.summary.rss", which is why the later panels reference columns as
# flights.summary.rsq / .bic / .cp while this first one uses $ directly --
# both resolve correctly but the mixed style is confusing.
# In each geom_point() call the aes(color = "red") object is matched
# positionally to `mapping`, while x/y are passed as fixed parameters.
flights.summary<-summary(flights.subset)
num_variables<-seq(1,length(flights.summary$rss))
plot_RSS<-ggplot(data = data.frame(flights.summary$rss),
aes(x=num_variables,y=flights.summary$rss))+
geom_line()+
geom_point(x=which.min(flights.summary$rss),
y=min(flights.summary$rss),aes(color="red"),
show.legend = FALSE)+
xlab("# Variables")+
ylab("RSS")+
theme_bw()
plot_R_sq<-ggplot(data = data.frame(flights.summary$rsq),
aes(x=num_variables,y=flights.summary.rsq))+
geom_line()+
geom_point(x=which.max(flights.summary$rsq),
y=max(flights.summary$rsq),aes(color="red"),
show.legend = FALSE)+
xlab("# Variables")+
ylab("R-sq")+
theme_bw()
plot_BIC<-ggplot(data = data.frame(flights.summary$bic),
aes(x=num_variables,y=flights.summary.bic))+
geom_line()+
geom_point(x=which.min(flights.summary$bic),
y=min(flights.summary$bic),aes(color="red"),
show.legend = FALSE)+
xlab("# Variables")+
ylab("BIC")+
theme_bw()
# The "AIC" panel actually plots Mallows' Cp (flights.summary$cp).
plot_AIC<-ggplot(data = data.frame(flights.summary$cp),
aes(x=num_variables,y=flights.summary.cp))+
geom_line()+
geom_point(x=which.min(flights.summary$cp),
y=min(flights.summary$cp),aes(color="red"),
show.legend = FALSE)+
xlab("# Variables")+
ylab("AIC")+
theme_bw()
grid.arrange(plot_RSS, plot_R_sq,plot_AIC,plot_BIC, ncol=2,nrow=2)
Next we visualize the variables that have minimized the BIC. We only did this for the backward stepwise model because both backward and forward came out to be similar and backward stepwise had more variables so we wanted to include a larger subset of variables.
# Visualize which variables enter the backward-stepwise models, ranked by BIC.
plot(flights.subset.bw,scale="bic")
We will now run the analysis of variable selection using the lasso
# Using the lasso now to do variable selection.
# model.matrix expands the Carrier factor into dummies; [,-1] drops the
# intercept column.  The hand-made Carrier* columns are also present, so some
# coefficients appear twice in the output below.
xs<-model.matrix(ArrDel15 ~.,newImpVarVectorFinal)[,-1]
y<-newImpVarVectorFinal$ArrDel15
#alpha=1 is a lasso penalty!
flights.lasso<-glmnet(xs,y,alpha=1)
# Coefficients at an arbitrary illustrative lambda; at s = 0.1 only DepDelay
# survives the penalty.
coef(flights.lasso,s=0.1) # can change this lambda value
## 32 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 0.336067832
## Month .
## CarrierCO .
## CarrierDL .
## CarrierEV .
## CarrierFL .
## CarrierMQ .
## CarrierNW .
## CarrierOH .
## CarrierOO .
## CarrierRU .
## CarrierUA .
## CarrierUS .
## CarrierWN .
## CarrierXE .
## CarrierYV .
## Distance .
## DayOfWeek .
## DayofMonth .
## DepDelay 0.004414527
## WheelsOff .
## CarrierCO .
## CarrierEV .
## CarrierFL .
## CarrierMQ .
## CarrierNW .
## CarrierOH .
## CarrierOO .
## CarrierRU .
## CarrierUS .
## CarrierWN .
## CarrierXE .
# Coefficient paths as lambda shrinks (left = heavy penalty, right = none).
plot(flights.lasso, label = TRUE)
The model becomes more complex as we move from left to right i.e. increase the number of variables in the model.
# Cross-validate lambda for the lasso and plot CV error vs log(lambda).
cv.out=cv.glmnet(xs,y,alpha=1)
plot(cv.out)
We use the 1 se lambda value to get the coefficients that are not 0 at that value. Although we decided to use multiple approaches to solving the problem of variable selection, we are most confident about the results from LASSO (the coefficients are a subset of those from backward stepwise selection except for a few), because LASSO uses a tuning parameter to penalize the number of parameters and this prevents overfitting. And we use cross-validation to find the 1 se lambda value to get the coefficients which makes us feel more confident about LASSO’s selection of the coefficients. Some coefficients were not included, like CarrierYV because they did not come out to be statistically significant when we tested them initially.
# Non-zero coefficients at the 1-SE lambda (largest lambda within one standard
# error of the CV minimum).
one.se.lambda <- cv.out$lambda.1se
# [1:21,] keeps the intercept plus the 20 distinct predictors, dropping the 11
# duplicated carrier-dummy rows at the bottom of the 32-row coefficient matrix.
lasso.coef = predict(flights.lasso, type ="coefficients", s = one.se.lambda)[1:21,]
# lasso.coef[lasso.coef != 0]
# Names of the selected (non-zero) coefficients, excluding the intercept.
one.se.lambda.coeffs <- names(lasso.coef[lasso.coef != 0])[names(lasso.coef[lasso.coef != 0]) != "(Intercept)"]
one.se.lambda.coeffs
## [1] "Month" "CarrierEV" "CarrierMQ" "CarrierRU" "CarrierUS" "CarrierWN"
## [7] "CarrierXE" "CarrierYV" "Distance" "DepDelay" "WheelsOff"
Now that we have the relevant coefficients, we train a model that will help us predict whether there will be a 15 min delay or not. This will essentially help the people who will be coming to pick up their relatives from the airport. We plan on training multiple models and see which one performs the best on the test dataset. We first train a decision tree. It would help us decide the path of variable values that lead to an arrival delay of greater than 15 minutes (1) or not (0).
# Keep only the selected predictors (by position in newImpVarVectorFinal:
# Month, CarrierEV, CarrierMQ, CarrierRU, CarrierNW, CarrierUS, CarrierWN,
# CarrierXE, Distance, DepDelay, WheelsOff, ArrDel15).
columns.to.select <- c(1, 10, 12, 16, 13, 17, 18, 19, 3, 7, 8, 4)
newImpVarVectorFinal <- newImpVarVectorFinal[,columns.to.select] # active for the final run
newImpVarVectorFinalTest <- newImpVarVectorFinalTest[,columns.to.select] # active for the final run
# Regression tree on the 0/1 response (predictions are delay probabilities).
flight.tree <- rpart(formula = ArrDel15 ~ ., data=newImpVarVectorFinal,
control = rpart.control(minsplit=100, cp=0.002))
# NOTE(review): as.party()/gpar() come from partykit/grid, which are not
# loaded in the library() block at the top -- confirm they are attached.
fltree <- as.party(flight.tree)
plot(fltree, gp = gpar(fontsize = 8))
plotcp(flight.tree)
# Index of the cp row with the lowest cross-validated error.
min.cv.idx <- which.min(flight.tree$cptable[,"xerror"])
# min CV error + 1se (this is the height of the horizontal bar)
min.cv.err.1se <- flight.tree$cptable[min.cv.idx,"xerror"] +
flight.tree$cptable[min.cv.idx,"xstd"]
# Which cp values produce models whose error is below min CV + 1se?
candidate.cp.vals <- flight.tree$cptable[which(flight.tree$cptable[,"xerror"] < min.cv.err.1se),"CP"]
# 1-SE rule value of cp: the largest (most aggressive) cp still within 1 SE.
cp.1se <- max(candidate.cp.vals)
print("1 se value of cp that we use for pruning: ")
## [1] "1 se value of cp that we use for pruning: "
print(round(cp.1se,5))
## [1] 0.00212
kable(flight.tree$cptable, digits=5, format="markdown")
| CP | nsplit | rel error | xerror | xstd |
|---|---|---|---|---|
| 0.52363 | 0 | 1.00000 | 1.00003 | 0.00144 |
| 0.02481 | 1 | 0.47637 | 0.47728 | 0.00455 |
| 0.02032 | 2 | 0.45157 | 0.45172 | 0.00422 |
| 0.00365 | 3 | 0.43124 | 0.43248 | 0.00406 |
| 0.00286 | 4 | 0.42759 | 0.42319 | 0.00396 |
| 0.00212 | 7 | 0.41862 | 0.41913 | 0.00392 |
| 0.00200 | 8 | 0.41650 | 0.41804 | 0.00388 |
The tree above gives several important insights. Firstly, we can see that there are 15 terminal nodes. Next we can map out many different paths. If the departure delay is greater than or equal to 14.5 minutes, we see that we can solely use the departure delay variable to predict if there will be an arrival delay of greater than 15 minutes or not. This makes sense, because if the flight departed later than the scheduled time then it would probably land later than the scheduled arrival time. What was interesting to note was that there were cases when the flight departed more than 15 minutes late and still did not get classified as having an arrival delay of more than 15 minutes (when departure delay is greater than 21.5 minutes). Secondly, if the departure delay was between 2.5 minutes and 14.5 minutes, then again departure delay in minutes was the single most important variable in deciding whether there would be a delay of 15 minutes or more. If the departure delay was less than 2.5 minutes, then a number of predictors, like distance and wheels-off time, determined whether the flight would arrive 15 minutes late or not.
We will now Prune the tree to see if the prediction error stays almost the same with a less complex tree, using the cp value that we found in the previous part. Pruning helps us decide which path is the most important and prevents overfitting of the data because it “prunes” branches that are not very necessary.
# Prune at the 1-SE cp and evaluate on the held-out test set.
flight.pruned <- prune(flight.tree, cp = cp.1se)
# Predicted delay probabilities for the test rows.
predict.pruned.flights <- predict(flight.pruned, newdata=newImpVarVectorFinalTest)
cutoff <- 0.50
# Threshold the probabilities into 0/1 class labels.
predictedVals <- ifelse(predict.pruned.flights > cutoff, 1, 0)
conf.table <- table(preds = predictedVals, actual = newImpVarVectorFinalTest$ArrDel15)
# metrics <- getMetrics(conf.table)
misClassRate = mean(predictedVals != newImpVarVectorFinalTest$ArrDel15)
misClassRate
## [1] 0.1287602
# Confusion-matrix cells: rows are predictions, columns are actuals.
TP <- conf.table["1", "1"]
FP <- conf.table["1", "0"]
TN <- conf.table["0", "0"]
FN <- conf.table["0", "1"]
The pruned tree gave a misclassification rate of 0.13, which turns out to be quite good. But what is more informative is the set of rates that matter for our use case. For the population we have in mind (people coming to pick up their relatives from the airport), it is most important that our model has a high true positive rate, or sensitivity (so that a real delay is correctly predicted as a delay), and a high true negative rate, or specificity (so that the absence of a delay is correctly predicted). It is also very important to minimize the number of cases where we predict no arrival delay but there is one, because this causes the most inconvenience for the person waiting. This is the false negative rate.
# Summary metrics for the pruned tree.  cbind() with a character first column
# coerces every value to character, which is fine for display purposes.
spec.table <- as.data.frame(cbind("Pruned tree: ",
"Sensitivity " = round(TP/(TP+FN),2),
"Specificity " = round(TN/(TN+FP),2),
"False negative rate " = round(FN/(FN+TP),2),
"Accuracy " = 1-round(misClassRate,2),
"Misclassification rate " = round(misClassRate,2)))
The table above shows the different metrics for the Pruned tree. We see that it performs fairly well on the given data set.
We now train an LDA model to see whether it can discriminate between the two levels of the binary response (arrival delay greater than 15 minutes or not). LDA is applicable because the response is binary, and it separates the two classes with a linear decision boundary.
# Fit LDA on the training data and compute in-sample confusion metrics.
flight.delay.15.lda <- lda(ArrDel15 ~ ., data = newImpVarVectorFinal)
# plot(flight.delay.15.lda, col = as.numeric(newImpVarVectorFinal$ArrDel15))
# NOTE(review): unlike the pruned tree (evaluated on the test set), these LDA
# metrics are computed on the TRAINING data -- confirm the comparison is fair.
confusion.lda <- table(predict(flight.delay.15.lda)$class,
newImpVarVectorFinal$ArrDel15)
# Confusion-matrix cells: rows are predictions, columns are actuals.
TP.lda <- confusion.lda["1", "1"]
FP.lda <- confusion.lda["1", "0"]
TN.lda <- confusion.lda["0", "0"]
FN.lda <- confusion.lda["0", "1"]
spec.table.lda <- as.data.frame(cbind("LDA: ",
"Sensitivity " = round(TP.lda/(TP.lda+FN.lda),2),
"Specificity " = round(TN.lda/(TN.lda+FP.lda),2),
"False negative rate " = round(FN.lda/(FN.lda+TP.lda),2),
"Accuracy " = round(sum(diag(confusion.lda)) / sum(confusion.lda),2),
"Misclassification rate " = round(1 - sum(diag(confusion.lda)) / sum(confusion.lda),2)))
# Stack the pruned-tree and LDA metric rows for side-by-side comparison.
comb.table <- rbind(spec.table, spec.table.lda)
comb.table
## V1 Sensitivity Specificity False negative rate Accuracy
## 1 Pruned tree: 0.76 0.95 0.24 0.87
## 2 LDA: 0.56 0.99 0.44 0.81
## Misclassification rate
## 1 0.13
## 2 0.19
We can see that although LDA performs slighty better for specificity, it does so at the expense of a large amount of sensitivity and false negative rate (both of which are important for us). Hence out of these two, we decide that a Pruned Tree is better than LDA because it helps us maximize or minimize metrics that are actually important to us. (Like maximize the sensitivity and specificity and minimize the false negative rate)
We now train a random forest model on the given data from 2006. A random forest is an ensemble learning method that we are using for our classification task. With random forests, we average over a large number of bagged trees, and each tree is built by considering a small random subset of predictor variables at each split. This leads to a model that is difficult to interpret. Random forests are very flexible and are less prone to overfitting than a single tree: each individual deep tree has low bias but high variance, and averaging many such trees reduces the variance.
So we use the variable importance plot to understand what they are saying. In our case, the variable importance plot suggests that firstly, all of the different carrier variables are not at all important. Removing them does not affect the node purity at all. The node impurity is a measure of the homogeneity of the labels at the node. And we could see this with the tree that we showed earlier because Carrier was not present at any of the nodes. What we do see is that DepDelay (Departure delay in minutes) is the most important variable and removing this variable has the most effect on node purity. This is followed by the variables WheelsOff, Distance and Month.
# Random forests
# Full training data is used here; the commented subscripts were only for
# faster development runs.
classification.data.flights.train = newImpVarVectorFinal#[1:10000,] # dev-time subsetting, disabled for the final run
classification.data.flights.test = newImpVarVectorFinalTest
#newImpVarVectorFinal[6000:8000,] # dev-time test subset, disabled
# NOTE(review): ArrDel15 is numeric 0/1, so randomForest runs in REGRESSION
# mode (hence the warning below) and predictions are probabilities; convert
# the response to a factor for a true classification forest.
flights.rf <- randomForest(ArrDel15 ~ .,
data = classification.data.flights.train,
type = "response")
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?
var.imp <- varImpPlot(flights.rf) # To find the most important variables
most.imp.vars <- rownames(var.imp)[order(var.imp, decreasing = TRUE)]
# Predicted probabilities on the test set, thresholded at 0.5.
rf.test.prob <- predict(flights.rf, newdata = classification.data.flights.test, type = "response")
rf.test.prob.c = ifelse(rf.test.prob > 0.5, 1, 0)
table(pred=rf.test.prob.c,actual=classification.data.flights.test$ArrDel15)
## actual
## pred 0 1
## 0 6875 909
## 1 301 4248
roc.rf <- roc(classification.data.flights.test$ArrDel15, rf.test.prob)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc.rf)
roc.rf$auc
## Area under the curve: 0.9713
We now train a boosted tree model on the same training data and compare its ROC curve against the random forest.
# Boosted trees and ROC curves
# The factor conversions below are intentionally left commented out;
# gbm with distribution = "bernoulli" expects a numeric 0/1 response.
# classification.data.flights.train$Carrier = as.factor(classification.data.flights.train$Carrier)
# classification.data.flights.train$OriginCityName = as.factor(classification.data.flights.train$OriginCityName)
# Gradient boosting: up to 10,000 depth-4 trees, learning rate 0.1, with
# 5-fold cross-validation used to select the optimal number of trees.
flights.boost=gbm(ArrDel15 ~ ., data = classification.data.flights.train, distribution = "bernoulli", shrinkage = 0.1,
n.trees=10000, interaction.depth=4, cv.folds = 5)
# gbm.perf(method = "cv") returns the CV-optimal iteration count.
best.ntrees = gbm.perf(flights.boost, method = "cv")
# Test-set predicted probabilities using only the CV-selected tree count,
# so we do not use the (potentially overfit) full 10,000-tree model.
pred.boost = predict(flights.boost, newdata=classification.data.flights.test, n.trees=best.ntrees, type="response")
roc.boost <- roc(classification.data.flights.test$ArrDel15, pred.boost)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Overlay the boosted-tree ROC (red) on the random-forest ROC (blue)
# for a direct visual comparison.
plot(roc.boost, col = "red")
plot(roc.rf, col = "blue", add = TRUE)
roc.boost$auc
## Area under the curve: 0.9769
# Plotting the cv error with the number of trees on the x-axis
qplot(1:10000, flights.boost$cv.error, xlab = "Number of trees")
The cross-validation error drops steeply over the first portion of the boosting iterations and then levels off, so we use the tree count at the minimum of the CV error curve (best.ntrees) for prediction rather than the full 10,000 trees.
# K-means clustering
# Drop the response column (2) first, then columns 8-22 of the reduced
# frame (the drops are sequential on purpose — indices shift after the
# first removal), leaving only the features we want to cluster on.
classification.data.flights.train <- classification.data.flights.train[, -2]
classification.data.flights.train <- classification.data.flights.train[, -(8:22)]
# Standardize every feature so that no single variable dominates the
# Euclidean distances used by k-means.
flights.scaled <- scale(classification.data.flights.train)
# Fit k-means for k = 2..5, each with 20 random restarts to reduce the
# chance of settling in a poor local optimum.
km.2 <- kmeans(flights.scaled, centers = 2, nstart = 20)
km.3 <- kmeans(flights.scaled, centers = 3, nstart = 20)
km.4 <- kmeans(flights.scaled, centers = 4, nstart = 20)
km.5 <- kmeans(flights.scaled, centers = 5, nstart = 20)
# Visualize the two-cluster solution.
fviz_cluster(km.2, data = flights.scaled)